Link to recording: https://smu.zoom.us/rec/share/Gujqwi30bZtZpix1Wlj58pEX7dx37r-WDR0gRYhTein89fP_C1e9dtbBbIxzKQOd.slY31p9lFCdaozCi?startTime=1666477030000 Passcode: FST@4!kH
Load in data
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.7 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
beers <- data.frame(read.csv('Beers.csv', stringsAsFactors = FALSE))
breweries <- data.frame(read.csv('Breweries.csv', stringsAsFactors = FALSE))
num_breweries <- breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar()+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
geom_text(stat='count', aes(label=..count..), vjust=-1)
num_breweries
Colorado has the largest amount of breweries.
head(beers)
head(breweries)
breweries <- breweries %>% rename("Brewery_id" = "Brew_ID")
bb <- merge(beers, breweries, by="Brewery_id", all=TRUE)
bb <- bb %>% rename("Beer" = "Name.x") %>% rename("Brewery" = "Name.y")
print(head(bb))
## Brewery_id Beer Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Brewery City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
print(tail(bb))
## Brewery_id Beer Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
summary(bb)
## Brewery_id Beer Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces Brewery
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# bb[is.na(bb)] <- 0
library(plyr)
## Warning: package 'plyr' was built under R version 4.1.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
impute.mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
bb <- ddply(bb, ~ Style, transform, ABV = impute.mean(ABV), IBU = impute.mean(IBU))
bb[is.na(bb)] <- 0
summary(bb)
## Brewery_id Beer Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05650
## Mean :232.7 Mean :1431.1 Mean :0.05975
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## IBU Style Ounces Brewery
## Min. : 0.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 33.41 Mode :character Median :12.00 Mode :character
## Mean : 39.98 Mean :13.59
## 3rd Qu.: 57.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
library(ggplot2)
bb %>% ggplot(aes(x = as.factor(State), y = ABV, fill = State)) + geom_bar(stat = "summary", fun = "median") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Median ABV")
bb %>% ggplot(aes(x = as.factor(State), y = IBU, fill = State)) + geom_bar(stat = "summary", fun = "median") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Median IBU")
ME has both the highest median ABV and the highest median IBU of all the states. There is a lot of overlap between median ABV and median IBU.
paste0("Max ABV State:", bb$State[which.max(bb$ABV)])
## [1] "Max ABV State: CO"
bb %>% slice_max(ABV)
paste0("Max IBU State:", bb$State[which.max(bb$IBU)])
## [1] "Max IBU State: OR"
The max ABV states is Colorado, and the max IBU state is Oregon.
summary(bb$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00100 0.05000 0.05650 0.05975 0.06700 0.12800
bb %>% ggplot(aes(x = ABV)) + geom_histogram() + ggtitle("Histogram of ABV")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The min is .1%, the max is 12.8%, and the median is 5.65%. This shows that most Americans prefer beer than is around the 5%-6% ABV. 7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
bb %>% ggplot(aes(x = IBU, y = ABV)) + geom_smooth(method = "lm") + geom_point() + ggtitle("ABV vs IBU")
## `geom_smooth()` using formula 'y ~ x'
The linear regression line shows a clear correlation between ABV and IBU. As ABV increases, so does IBU.
unique(bb$Style)
## [1] ""
## [2] "Abbey Single Ale"
## [3] "Altbier"
## [4] "American Adjunct Lager"
## [5] "American Amber / Red Ale"
## [6] "American Amber / Red Lager"
## [7] "American Barleywine"
## [8] "American Black Ale"
## [9] "American Blonde Ale"
## [10] "American Brown Ale"
## [11] "American Dark Wheat Ale"
## [12] "American Double / Imperial IPA"
## [13] "American Double / Imperial Pilsner"
## [14] "American Double / Imperial Stout"
## [15] "American India Pale Lager"
## [16] "American IPA"
## [17] "American Malt Liquor"
## [18] "American Pale Ale (APA)"
## [19] "American Pale Lager"
## [20] "American Pale Wheat Ale"
## [21] "American Pilsner"
## [22] "American Porter"
## [23] "American Stout"
## [24] "American Strong Ale"
## [25] "American White IPA"
## [26] "American Wild Ale"
## [27] "Baltic Porter"
## [28] "Belgian Dark Ale"
## [29] "Belgian IPA"
## [30] "Belgian Pale Ale"
## [31] "Belgian Strong Dark Ale"
## [32] "Belgian Strong Pale Ale"
## [33] "Berliner Weissbier"
## [34] "Bière de Garde"
## [35] "Bock"
## [36] "Braggot"
## [37] "California Common / Steam Beer"
## [38] "Chile Beer"
## [39] "Cider"
## [40] "Cream Ale"
## [41] "Czech Pilsener"
## [42] "Doppelbock"
## [43] "Dortmunder / Export Lager"
## [44] "Dubbel"
## [45] "Dunkelweizen"
## [46] "English Barleywine"
## [47] "English Bitter"
## [48] "English Brown Ale"
## [49] "English Dark Mild Ale"
## [50] "English India Pale Ale (IPA)"
## [51] "English Pale Ale"
## [52] "English Pale Mild Ale"
## [53] "English Stout"
## [54] "English Strong Ale"
## [55] "Euro Dark Lager"
## [56] "Euro Pale Lager"
## [57] "Extra Special / Strong Bitter (ESB)"
## [58] "Flanders Oud Bruin"
## [59] "Flanders Red Ale"
## [60] "Foreign / Export Stout"
## [61] "Fruit / Vegetable Beer"
## [62] "German Pilsener"
## [63] "Gose"
## [64] "Grisette"
## [65] "Hefeweizen"
## [66] "Herbed / Spiced Beer"
## [67] "Irish Dry Stout"
## [68] "Irish Red Ale"
## [69] "Kölsch"
## [70] "Keller Bier / Zwickel Bier"
## [71] "Kristalweizen"
## [72] "Light Lager"
## [73] "Low Alcohol Beer"
## [74] "Märzen / Oktoberfest"
## [75] "Maibock / Helles Bock"
## [76] "Mead"
## [77] "Milk / Sweet Stout"
## [78] "Munich Dunkel Lager"
## [79] "Munich Helles Lager"
## [80] "Oatmeal Stout"
## [81] "Old Ale"
## [82] "Other"
## [83] "Pumpkin Ale"
## [84] "Quadrupel (Quad)"
## [85] "Radler"
## [86] "Rauchbier"
## [87] "Roggenbier"
## [88] "Russian Imperial Stout"
## [89] "Rye Beer"
## [90] "Saison / Farmhouse Ale"
## [91] "Schwarzbier"
## [92] "Scotch Ale / Wee Heavy"
## [93] "Scottish Ale"
## [94] "Shandy"
## [95] "Smoked Beer"
## [96] "Tripel"
## [97] "Vienna Lager"
## [98] "Wheat Ale"
## [99] "Winter Warmer"
## [100] "Witbier"
bb$Class1 <- NA
for(i in 1:nrow(bb)){
if(grepl("IPA", bb$Style[i]) == TRUE | grepl("India Pale Ale", bb$Style[i]) == TRUE){
bb$Class1[i] <- "IPA"
} else if(grepl("Ale", bb$Style[i]) == TRUE & !grepl("India Pale Ale", bb$Style[i]) == TRUE & !grepl("IPA", bb$Style[i]) == TRUE){
bb$Class1[i] <- "Ale"
} else{
bb$Class1[i] <- NA
}
}
library(class)
## Warning: package 'class' was built under R version 4.1.3
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
set.seed(1234)
bb_knn_df <- na.omit(bb)
splitPerc <- 0.80
trainIndices <- sample(1:dim(bb_knn_df)[1], round(splitPerc * dim(bb_knn_df)[1]))
train <- bb_knn_df[trainIndices,]
test <- bb_knn_df[-trainIndices,]
train$ABV <- scale(train$ABV)
train$IBU <- scale(train$IBU)
test$ABV <- scale(test$ABV)
test$IBU <- scale(test$IBU)
accuracy <- c()
k <- c()
# Find maximum accuracy and its respective k value
for(i in 1:300){
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = i)
CM <- confusionMatrix(table(classifications, test$Class1))
accuracy[i] <- CM$overall[1]
k[i] <- i
if(i == 1){
max_k <- i
accuracy_max <- accuracy[i]
}else if(accuracy[i] >= accuracy_max){
max_k <- i
accuracy_max <- accuracy[i]
}
}
plot(k, accuracy, type = "l", xlab = "k")
print(paste0("Max k:", max_k))
## [1] "Max k:17"
print(paste0("Max accuracy:", accuracy[max_k]))
## [1] "Max accuracy:0.899022801302932"
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
CM <- confusionMatrix(table(classifications, test$Class1))
print(CM)
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 184 12
## IPA 19 92
##
## Accuracy : 0.899
## 95% CI : (0.8597, 0.9304)
## No Information Rate : 0.6612
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7782
##
## Mcnemar's Test P-Value : 0.2812
##
## Sensitivity : 0.9064
## Specificity : 0.8846
## Pos Pred Value : 0.9388
## Neg Pred Value : 0.8288
## Prevalence : 0.6612
## Detection Rate : 0.5993
## Detection Prevalence : 0.6384
## Balanced Accuracy : 0.8955
##
## 'Positive' Class : Ale
##
The accuracy is 0.8893. This is rather good for a KNN model. The sensitivity and specificity are also high. Also, the pos pred value is high meaning that there are a lot of true values that were predicted.
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
x_test <- test %>% select("ABV", "IBU")
y_test <- test %>% select("Class1")
yscore <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
yscore <- attributes(yscore)$prob
pdb <- cbind(x_test, y_test)
pdb <- cbind(pdb, yscore)
fig <- plot_ly(data = pdb,x = ~IBU, y = ~ABV, type = 'scatter', mode = 'markers',color = ~yscore, colors = 'RdBu', symbol = ~Class1, split = ~Class1, symbols = c('square-dot','circle-dot'), marker = list(size = 12, line = list(color = 'black', width = 1)))
fig
The above plot shows the divisions between the IPAs and Ales graphically. The index shows the probability/the intesisty which the beer was categorized.
p3<-ggplot(train,aes(x=IBU,y=ABV,colour=Class1)) + geom_jitter() +geom_density2d() + ggtitle("Density Plot of ABV vs IBU")
p3
fit.IBU <- aov(IBU~Style, data = bb)
print("ANOVA IBU")
## [1] "ANOVA IBU"
summary(fit.IBU)
## Df Sum Sq Mean Sq F value Pr(>F)
## Style 99 1186184 11982 116.1 <2e-16 ***
## Residuals 2310 238303 103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit.ABV <- aov(ABV~Style, data = bb)
print("ANOVA ABV")
## [1] "ANOVA ABV"
summary(fit.ABV)
## Df Sum Sq Mean Sq F value Pr(>F)
## Style 99 0.2754 0.0027822 40.15 <2e-16 ***
## Residuals 2310 0.1601 0.0000693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is significant evidence to prove that at least one Style has a different IBU, and one style has a different ABV (p-value < 2e-16).As a result, one can infer that different styles are characterized by different combinations of bitterness and alcohol level. The plot below shows that there are different ABV and IBU levels for each style of beer. More so, there appears to be a greater correlation between style and IBU than style and ABV.
bb %>% ggplot(aes(x = IBU, y = ABV, color = Style)) + geom_point() + theme(legend.position = "None") + ggtitle("ABV vs IBU by Style")